# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dalex as dx
import warnings
warnings.filterwarnings('ignore')  # silence library warnings to keep notebook output readable
import plotly
plotly.offline.init_notebook_mode()  # enable inline plotly rendering (dalex plots use plotly)
pd.set_option('display.max_columns', None)  # show every column when displaying wide frames
# Load the hotel bookings dataset from the working directory.
df = pd.read_csv('hotel_bookings.csv')
df.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
df.shape
(119390, 32)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 hotel 119390 non-null object 1 is_canceled 119390 non-null int64 2 lead_time 119390 non-null int64 3 arrival_date_year 119390 non-null int64 4 arrival_date_month 119390 non-null object 5 arrival_date_week_number 119390 non-null int64 6 arrival_date_day_of_month 119390 non-null int64 7 stays_in_weekend_nights 119390 non-null int64 8 stays_in_week_nights 119390 non-null int64 9 adults 119390 non-null int64 10 children 119386 non-null float64 11 babies 119390 non-null int64 12 meal 119390 non-null object 13 country 118902 non-null object 14 market_segment 119390 non-null object 15 distribution_channel 119390 non-null object 16 is_repeated_guest 119390 non-null int64 17 previous_cancellations 119390 non-null int64 18 previous_bookings_not_canceled 119390 non-null int64 19 reserved_room_type 119390 non-null object 20 assigned_room_type 119390 non-null object 21 booking_changes 119390 non-null int64 22 deposit_type 119390 non-null object 23 agent 103050 non-null float64 24 company 6797 non-null float64 25 days_in_waiting_list 119390 non-null int64 26 customer_type 119390 non-null object 27 adr 119390 non-null float64 28 required_car_parking_spaces 119390 non-null int64 29 total_of_special_requests 119390 non-null int64 30 reservation_status 119390 non-null object 31 reservation_status_date 119390 non-null object dtypes: float64(4), int64(16), object(12) memory usage: 29.1+ MB
# Replace missing values:
# agent: if no agency is given, the booking was most likely made without one.
# company: if none is given, the booking was most likely private.
# The rest should be self-explanatory.
# BUG FIX: the key was previously "children:" (trailing colon), so children
# NaNs were silently never filled; corrected to "children".
nan_replacements = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
df = df.fillna(nan_replacements)
# "meal" contains the value "Undefined", which is equivalent to SC.
# Assign the result instead of chained .replace(..., inplace=True), which is
# deprecated in recent pandas and may operate on a copy.
df["meal"] = df["meal"].replace("Undefined", "SC")
# Some rows contain entries with 0 adults, 0 children and 0 babies.
# Drop these bookings with no guests at all (drop by index label, not position).
zero_guests = df.loc[df["adults"] + df["children"] + df["babies"] == 0].index
df.drop(index=zero_guests, inplace=True)
# Feature engineering: average daily rate per person and total length of stay.
df["adr_pp"] = df["adr"] / (df["adults"] + df["children"])
df["total_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]
# Manually selected model inputs. Several columns are deliberately excluded to
# keep the model general and to avoid target leakage:
# arrival_date_year, assigned_room_type, booking_changes, reservation_status,
# country, days_in_waiting_list, hotel.
# Including `country` would raise accuracy, but could also make the model less
# general and potentially unfair.
num_features = [
    "lead_time", "arrival_date_week_number", "arrival_date_day_of_month",
    "stays_in_weekend_nights", "stays_in_week_nights", "total_nights",
    "adults", "children", "babies",
    "is_repeated_guest", "previous_cancellations",
    "previous_bookings_not_canceled",
    "agent", "company",
    "required_car_parking_spaces", "total_of_special_requests",
    "adr", "adr_pp",
]
cat_features = [
    "arrival_date_month", "meal", "market_segment", "distribution_channel",
    "reserved_room_type", "deposit_type", "customer_type",
]
# All explanatory variables: numeric first, categorical second.
features = [*num_features, *cat_features]
# separate features and target
# X: selected explanatory variables only; y: binary cancellation flag from the data.
X = df.drop(["is_canceled"], axis=1)[features]
y = df["is_canceled"]
For simplicity, the models were trained separately in another notebook; here we only load the trained models.
# Load the pipelines fitted in a separate training notebook.
import pickle


def _load_model(path):
    """Unpickle a fitted model pipeline from *path*.

    Uses a context manager so the file handle is closed (the original
    pickle.load(open(...)) pattern leaked the handle).
    NOTE: pickle.load must only be used on trusted, locally produced files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


dt_model = _load_model('models/dt_pipe_best.sav')
rf_model = _load_model('models/rf_pipe_best.sav')
xgb_model = _load_model('models/xgb_pipe_best.sav')
Let's select the observation for which the models' predictions will be explained.
# Index label of the booking whose predictions will be explained.
idx = 8217
# Double brackets keep a one-row DataFrame (not a Series), as the pipelines expect 2-D input.
observation = X.loc[[idx]]
print(observation.squeeze())  # squeeze to a Series for compact display
lead_time 53 arrival_date_week_number 39 arrival_date_day_of_month 18 stays_in_weekend_nights 2 stays_in_week_nights 2 total_nights 4 adults 2 children 0.0 babies 0 is_repeated_guest 0 previous_cancellations 0 previous_bookings_not_canceled 0 agent 240.0 company 0.0 required_car_parking_spaces 0 total_of_special_requests 0 adr 111.75 adr_pp 55.875 arrival_date_month September meal BB market_segment Online TA distribution_channel TA/TO reserved_room_type E deposit_type No Deposit customer_type Transient Name: 8217, dtype: object
# Switch to a different booking (index label 3) for the comparison below.
idx = 3
observation = X.loc[[idx]]
print(observation.squeeze())
# Predicted class (0 = not canceled, 1 = canceled) for each model.
dt_prediction = dt_model.predict(observation)[0]
rf_prediction = rf_model.predict(observation)[0]
xgb_prediction = xgb_model.predict(observation)[0]
# Probability each model assigns to its own predicted class
# (predict_proba row indexed by the predicted label).
dt_prob = dt_model.predict_proba(observation)[0][dt_prediction]
rf_prob = rf_model.predict_proba(observation)[0][rf_prediction]
xgb_prob = xgb_model.predict_proba(observation)[0][xgb_prediction]
print(f'Predicted value for DT model for the selected observation: {dt_prediction} ({dt_prob}), real value: {y.loc[idx]}')
print(f'Predicted value for RF model for the selected observation: {rf_prediction} ({rf_prob}), real value: {y.loc[idx]}')
print(f'Predicted value for XGB model for the selected observation: {xgb_prediction} ({xgb_prob}), real value: {y.loc[idx]}')
lead_time 13 arrival_date_week_number 27 arrival_date_day_of_month 1 stays_in_weekend_nights 0 stays_in_week_nights 1 total_nights 1 adults 1 children 0.0 babies 0 is_repeated_guest 0 previous_cancellations 0 previous_bookings_not_canceled 0 agent 304.0 company 0.0 required_car_parking_spaces 0 total_of_special_requests 0 adr 75.0 adr_pp 75.0 arrival_date_month July meal BB market_segment Corporate distribution_channel Corporate reserved_room_type A deposit_type No Deposit customer_type Transient Name: 3, dtype: object Predicted value for DT model for the selected observation: 0 (0.6829819277108434), real value: 0 Predicted value for RF model for the selected observation: 0 (0.8974463499463499), real value: 0 Predicted value for XGB model for the selected observation: 0 (0.9540161490440369), real value: 0
As we can see, all models classify our observation correctly, but the Decision Tree does so with the least confidence and XGBoost with the most.
Let's check how models predictions would change if the values of some explanatory variables changed. For this purpose, we can use Ceteris Paribus profiles, often also called 'what-if' plots. First let's create explainer object for each trained model.
# dalex explainer wrapping the Decision Tree pipeline together with the data and target.
dt_explainer = dx.Explainer(dt_model, X, y)
Preparation of a new explainer is initiated -> data : 119210 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 119210 values -> model_class : sklearn.tree._classes.DecisionTreeClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x000002083A729940> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0, mean = 0.371, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -1.0, mean = -0.00015, max = 1.0 -> model_info : package sklearn A new explainer has been created!
# dalex explainer wrapping the Random Forest pipeline together with the data and target.
rf_explainer = dx.Explainer(rf_model, X, y)
Preparation of a new explainer is initiated -> data : 119210 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 119210 values -> model_class : sklearn.ensemble._forest.RandomForestClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x000002083A729940> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 0.0, mean = 0.374, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -1.0, mean = -0.00285, max = 1.0 -> model_info : package sklearn A new explainer has been created!
# dalex explainer wrapping the XGBoost pipeline together with the data and target.
xgb_explainer = dx.Explainer(xgb_model, X, y)
Preparation of a new explainer is initiated -> data : 119210 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 119210 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x000002083A729940> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 1.38e-05, mean = 0.369, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -0.999, mean = 0.00197, max = 1.0 -> model_info : package sklearn A new explainer has been created!
# Ceteris Paribus ('what-if') profiles for the selected observation, one per model.
cp_dt = dt_explainer.predict_profile(observation)
cp_rf = rf_explainer.predict_profile(observation)
cp_xgb = xgb_explainer.predict_profile(observation)
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:00<00:00, 47.19it/s] Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:08<00:00, 2.95it/s] Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:01<00:00, 20.55it/s]
# Variables shown in the what-if panels; overlay all three models on one figure.
profile_variables = [
    'lead_time',
    'arrival_date_week_number',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'required_car_parking_spaces',
    'total_of_special_requests',
]
cp_dt.plot(objects=[cp_rf, cp_xgb], variables=profile_variables)
Looking at the Ceteris Paribus profiles, we can say that DT appears to be the least complex model and XGB the most complex of them. We can deduce this by looking at, e.g., lead_time. In the case of the DT model, the CP profile is very simple, increasing linearly up to 14 days and then constant. On the other hand, in the case of the XGB model, it is a really complex curve without any monotonicity. For all models, changing the number of previous cancellations to any non-zero value would change the predictions completely. It's worth noting that, in the case of the variable total_of_special_requests, increasing its value to any value greater than zero would cause a decrease in the probability of cancelation for the DT model, while for the other two models it would cause a really small increase. It is also interesting to note that, for all models, changing the value of previous bookings that haven't been canceled has no effect on the prediction.
Let's compare Ceteris Paribus profiles for two observations - one correctly classified as canceled with absolute certainty, and the other also classified correctly, with high confidence, but as not canceled. For this analysis we will use the XGB model.
# Two contrasting bookings for the XGB model: one classified as canceled with
# very high confidence, one classified as not canceled with high confidence.
idx_canceled = 9597
idx_not_canceled = 91306
observation_canceled = X.loc[[idx_canceled]]
observation_not_canceled = X.loc[[idx_not_canceled]]
observation_canceled.squeeze()
observation_not_canceled.squeeze()
prediction_canceled = xgb_model.predict(observation_canceled)
prediction_not_canceled = xgb_model.predict(observation_not_canceled)
prob_canceled = xgb_model.predict_proba(observation_canceled)
prob_not_canceled = xgb_model.predict_proba(observation_not_canceled)
print(f'Predicted value for xgb model for canceled observation: {prediction_canceled[0]} ({prob_canceled[0][prediction_canceled[0]]}), real value: {y.loc[idx_canceled]}')
# BUG FIX: the probability was previously indexed with prediction_canceled[0],
# so it printed the probability of the WRONG class (0.0014 instead of ~0.9986).
# Index with the not-canceled prediction so the probability matches the
# predicted class, consistent with the line above.
print(f'Predicted value for xgb model for non-canceled observation: {prediction_not_canceled[0]} ({prob_not_canceled[0][prediction_not_canceled[0]]}), real value: {y.loc[idx_not_canceled]}')
# Labeled CP profiles for the two contrasting observations, then plot them
# together for the same set of variables as before.
cp_canceled = xgb_explainer.predict_profile(observation_canceled, label='canceled')
cp_not_canceled = xgb_explainer.predict_profile(observation_not_canceled, label='not canceled')
contrast_variables = [
    'lead_time',
    'arrival_date_week_number',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'required_car_parking_spaces',
    'total_of_special_requests',
]
cp_canceled.plot(cp_not_canceled, variables=contrast_variables)
Predicted value for xgb model for canceled observation: 1 (0.9717550873756409), real value: 1 Predicted value for xgb model for non-canceled observation: 0 (0.0014058842789381742), real value: 0
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:01<00:00, 17.62it/s] Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:01<00:00, 15.08it/s]
In the case of the canceled observation, shortening the period between the booking and the arrival/cancelation date (lead time) would decrease the certainty of the model's prediction. However, the biggest changes in the prediction could come from additional special requests and required car parking spaces: changing these values to non-zero would change the prediction completely. It is also worth noting the impact of the number of previous bookings canceled and not canceled: changing the value of the first one would not affect the prediction, while changing the second one would cause a decrease in the probability of cancelation.
When considering prediction for an observation classified as not canceled we see that the only two explanatory variables whose change would have an impact on the model prediction are the number of previously canceled reservations and the total number of special requests.
Looking at the Ceteris Paribus profiles, we can also say that prediction for the observation classified as not canceled is more stable, i.e. less sensitive to changes in values of predictors.